{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "from mlxtend.frequent_patterns import apriori, association_rules\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dataset = pd.read_csv(\"data.csv\",encoding= 'unicode_escape')\n",
    "dataset = pd.read_excel(\"Online Retail.xlsx\")\n",
    "dataset.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Verify missing value\n",
    "dataset.isnull().sum().sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Remove missing values\n",
    "dataset1 = dataset.dropna()\n",
    "dataset1.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#selecting data where quantity > 0\n",
    "dataset1= dataset1[dataset1.Quantity > 0]\n",
    "dataset1.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Creating a new feature 'Amount' which is the product of Quantity and its Unit Price\n",
    "dataset1['Amount'] = dataset1['Quantity'] * dataset1['UnitPrice']\n",
    "# to highlight the Customers with most no. of orders (invoices) with groupby function\n",
    "orders = dataset1.groupby(by=['CustomerID','Country'], as_index=False)['InvoiceNo'].count()\n",
    "print('The TOP 5 loyal customers with most number of orders...')\n",
    "orders.sort_values(by='InvoiceNo', ascending=False).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Creating a subplot of size 15x6\n",
    "plt.subplots(figsize=(15,6))\n",
    "# Using the style bmh for better visualization\n",
    "plt.style.use('bmh')\n",
    "# X axis will denote the customer ID, Y axis will denote the number of orders\n",
    "plt.plot(orders.CustomerID, orders.InvoiceNo)\n",
    "# Labelling the X axis\n",
    "plt.xlabel('Customers ID')\n",
    "# Labelling the Y axis\n",
    "plt.ylabel('Number of Orders')\n",
    "#  Title to the plot\n",
    "plt.title('Number of Orders by different Customers')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Using groupby function to highlight the Customers with highest spent amount (invoices)\n",
    "money = dataset1.groupby(by=['CustomerID','Country'], as_index=False)['Amount'].sum()\n",
    "print('The TOP 5 profitable customers with highest money spent...')\n",
    "money.sort_values(by='Amount', ascending=False).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Creating a subplot of size 15*6\n",
    "plt.subplots(figsize=(15,6))\n",
    "# X axis will denote the customer ID, Y axis will denote the amount spent\n",
    "plt.plot(money.CustomerID, money.Amount)\n",
    "# Using bmh style for better visualization\n",
    "plt.style.use('bmh')\n",
    "# Labelling the X-axis\n",
    "plt.xlabel('Customers ID')\n",
    "# Labelling the Y-axis\n",
    "plt.ylabel('Money spent')\n",
    "# Giving a suitable title to the plot\n",
    "plt.title('Money Spent by different Customers')\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert InvoiceDate from object to datetime\n",
    "dataset1['InvoiceDate'] = pd.to_datetime(dataset.InvoiceDate, format='%m/%d/%Y %H:%M')\n",
    "# Creating a new feature called year_month, such that December 2010 will be denoted as 201012\n",
    "dataset1.insert(loc=2, column='year_month', value=dataset1['InvoiceDate'].map(lambda x: 100*x.year + x.month))\n",
    "# Creating a new feature for Month\n",
    "dataset1.insert(loc=3, column='month', value=dataset1.InvoiceDate.dt.month)\n",
    "# Creating a new feature for Day\n",
    "# +1 to make Monday=1.....until Sunday=7\n",
    "dataset1.insert(loc=4, column='day', value=(dataset1.InvoiceDate.dt.dayofweek)+1)\n",
    "# Creating a new feature for Hour\n",
    "dataset1.insert(loc=5, column='hour', value=dataset1.InvoiceDate.dt.hour)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Using bmh style for better visualization\n",
    "plt.style.use('bmh')\n",
    "# Using groupby to extract No. of Invoices year-monthwise\n",
    "ax = dataset1.groupby('InvoiceNo')['year_month'].unique().value_counts().sort_index().plot(kind='bar',figsize=(15,6))\n",
    "# Labelling the X axis\n",
    "ax.set_xlabel('Month',fontsize=15)\n",
    "# Labelling the Y-axis\n",
    "ax.set_ylabel('Number of Orders',fontsize=15)\n",
    "# Giving suitable title to the plot\n",
    "ax.set_title('Number of orders for different Months (Dec 2010 - Dec 2011)',fontsize=15)\n",
    "# Providing with X tick labels\n",
    "ax.set_xticklabels(('Dec_10','Jan_11','Feb_11','Mar_11','Apr_11','May_11','Jun_11','July_11','Aug_11','Sep_11','Oct_11','Nov_11','Dec_11'), rotation='horizontal', fontsize=13)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Day = 6 is Saturday.no orders placed \n",
    "dataset1[dataset1['day']==6]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Using groupby to count no. of Invoices daywise\n",
    "ax = dataset1.groupby('InvoiceNo')['day'].unique().value_counts().sort_index().plot(kind='bar',figsize=(15,6))\n",
    "# Labelling X axis\n",
    "ax.set_xlabel('Day',fontsize=15)\n",
    "# Labelling Y axis\n",
    "ax.set_ylabel('Number of Orders',fontsize=15)\n",
    "# Giving suitable title to the plot\n",
    "ax.set_title('Number of orders for different Days',fontsize=15)\n",
    "# Providing with X tick labels\n",
    "# Since there are no orders placed on Saturdays, we are excluding Sat from xticklabels\n",
    "ax.set_xticklabels(('Mon','Tue','Wed','Thur','Fri','Sun'), rotation='horizontal', fontsize=15)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Using groupby to count the no. of Invoices hourwise\n",
    "ax = dataset1.groupby('InvoiceNo')['hour'].unique().value_counts().iloc[:-2].sort_index().plot(kind='bar',figsize=(15,6))\n",
    "# Labelling X axis\n",
    "ax.set_xlabel('Hour',fontsize=15)\n",
    "# Labelling Y axis\n",
    "ax.set_ylabel('Number of Orders',fontsize=15)\n",
    "# Giving suitable title to the plot\n",
    "ax.set_title('Number of orders for different Hours', fontsize=15)\n",
    "# Providing with X tick lables ( all orders are placed between 6 and 20 hour )\n",
    "ax.set_xticklabels(range(6,21), rotation='horizontal', fontsize=15)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset1.UnitPrice.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# checking the distribution of unit price\n",
    "plt.subplots(figsize=(12,6))\n",
    "# Using darkgrid style for better visualization\n",
    "sns.set_style('darkgrid')\n",
    "# Applying boxplot visualization on Unit Price\n",
    "sns.boxplot(dataset1.UnitPrice)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Creating a new df of free items\n",
    "freeproducts = dataset1[dataset1['UnitPrice'] == 0]\n",
    "freeproducts.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Counting how many free items were given out year-month wise\n",
    "freeproducts.year_month.value_counts().sort_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Counting how many free items were given out year-month wise\n",
    "ax = freeproducts.year_month.value_counts().sort_index().plot(kind='bar',figsize=(12,6))\n",
    "# Labelling X-axis\n",
    "ax.set_xlabel('Month',fontsize=15)\n",
    "# Labelling Y-axis\n",
    "ax.set_ylabel('Frequency',fontsize=15)\n",
    "# Giving suitable title to the plot\n",
    "ax.set_title('Frequency for different Months (Dec 2010 - Dec 2011)',fontsize=15)\n",
    "# Providing X tick labels\n",
    "# Since there are 0 free items in June 2011, we are excluding it\n",
    "ax.set_xticklabels(('Dec_10','Jan_11','Feb_11','Mar_11','Apr_11','May_11','July_11','Aug_11','Sep_11','Oct_11','Nov_11'), rotation='horizontal', fontsize=13)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.style.use('bmh')\n",
    "# Using groupby to sum the amount spent year-month wise\n",
    "ax = dataset1.groupby('year_month')['Amount'].sum().sort_index().plot(kind='bar',figsize=(15,6))\n",
    "# Labelling X axis\n",
    "ax.set_xlabel('Month',fontsize=15)\n",
    "# Labelling Y axis\n",
    "ax.set_ylabel('Amount',fontsize=15)\n",
    "# Giving suitable title to the plot\n",
    "ax.set_title('Revenue Generated for different Months (Dec 2010 - Dec 2011)',fontsize=15)\n",
    "# Providing with X tick labels\n",
    "ax.set_xticklabels(('Dec_10','Jan_11','Feb_11','Mar_11','Apr_11','May_11','Jun_11','July_11','Aug_11','Sep_11','Oct_11','Nov_11','Dec_11'), rotation='horizontal', fontsize=13)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Creating a new pivot table which sums the Quantity ordered for each item\n",
    "most_sold= dataset1.pivot_table(index=['StockCode','Description'], values='Quantity', aggfunc='sum').sort_values(by='Quantity', ascending=False)\n",
    "most_sold.reset_index(inplace=True)\n",
    "sns.set_style('white')\n",
    "# Creating a bar plot of Description ( or the item ) on the Y axis and the sum of Quantity on the X axis\n",
    "# We are plotting only the 10 most ordered items\n",
    "sns.barplot(y='Description', x='Quantity', data=most_sold.head(10))\n",
    "# Giving suitable title to the plot\n",
    "plt.title('Top 10 Items based on No. of Sales', fontsize=14)\n",
    "plt.ylabel('Item')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# choosing WHITE HANGING HEART T-LIGHT HOLDER as a sample\n",
    "d_white = dataset1[dataset1['Description']=='WHITE HANGING HEART T-LIGHT HOLDER']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# WHITE HANGING HEART T-LIGHT HOLDER has been ordered 2028 times\n",
    "d_white.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# WHITE HANGING HEART T-LIGHT HOLDER has been ordered by 856 customers\n",
    "len(d_white.CustomerID.unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Creating a pivot table that displays the sum of unique Customers who bought particular item\n",
    "\n",
    "most_customers = dataset1.pivot_table(index=['StockCode','Description'], values='CustomerID', aggfunc=lambda x: len(x.unique())).sort_values(by='CustomerID', ascending=False)\n",
    "most_customers\n",
    "# Since the count for WHITE HANGING HEART T-LIGHT HOLDER matches above length 856, the pivot table looks correct for all items"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "most_customers.reset_index(inplace=True)\n",
    "sns.set_style('white')\n",
    "# Creating a bar plot of Description ( or the item ) on the Y axis and the sum of unique Customers on the X axis\n",
    "# We are plotting only the 10 most bought items\n",
    "sns.barplot(y='Description', x='CustomerID', data=most_customers.head(10))\n",
    "# Giving suitable title to the plot\n",
    "plt.title('Top 10 Items bought by Most no. of Customers', fontsize=14)\n",
    "plt.ylabel('Item')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Storing all the invoice numbers into a list y\n",
    "y = dataset1['InvoiceNo']\n",
    "y = y.to_list()\n",
    "# Using set function to find unique invoice numbers only and storing them in invoices list\n",
    "invoices = list(set(y))\n",
    "# Creating empty list first_choices\n",
    "firstchoices = []\n",
    "# looping into list of unique invoice numbers\n",
    "for i in invoices:\n",
    "    \n",
    "    # the first item (index = 0) of every invoice is the first purchase\n",
    "    # extracting the item name for the first purchase\n",
    "    firstpurchase = dataset1[dataset1['InvoiceNo']==i]['items'].reset_index(drop=True)[0]\n",
    "    \n",
    "    # Appending the first purchase name into first choices list\n",
    "    firstchoices.append(firstpurchase)\n",
    "firstchoices[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Using counter to count repeating first choices\n",
    "count = Counter(firstchoices)\n",
    "# Storing the counter into a datafrane\n",
    "data_first_choices = pd.DataFrame.from_dict(count, orient='index').reset_index()\n",
    "# Rename columns as item and count\n",
    "data_first_choices.rename(columns={'index':'item', 0:'count'},inplace=True)\n",
    "# Sorting the data based on count\n",
    "data_first_choices.sort_values(by='count',ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.subplots(figsize=(20,10))\n",
    "sns.set_style('white')\n",
    "# Creating a bar plot that displays Item name on the Y axis and Count on the X axis\n",
    "sns.barplot(y='item', x='count', data=data_first_choices.sort_values(by='count',ascending=False).head(10))\n",
    "# Giving suitable title to the plot\n",
    "plt.title('Top 10 First Choices', fontsize=14)\n",
    "plt.ylabel('Item')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "basket = (dataset1.groupby(['InvoiceNo', 'Description'])['Quantity'].sum().unstack().reset_index().fillna(0).set_index('InvoiceNo'))\n",
    "basket.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def encode_u(x):\n",
    "    if x < 1:\n",
    "        return 0\n",
    "    if x >= 1:\n",
    "        return 1\n",
    "\n",
    "basket = basket.applymap(encode_u)\n",
    "# everything is encoded into 0 and 1\n",
    "basket.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# trying out on a sample item\n",
    "wooden_star = basket.loc[basket['WOODEN STAR CHRISTMAS SCANDINAVIAN']==1]\n",
    "# Using apriori algorithm, creating association rules for the sample item\n",
    "# Applying apriori algorithm for wooden_star\n",
    "frequentitemsets = apriori(wooden_star, min_support=0.15, use_colnames=True)\n",
    "# Storing the association rules into rules\n",
    "wooden_star_rules = association_rules(frequentitemsets, metric=\"lift\", min_threshold=1)\n",
    "# Sorting the rules on lift and support\n",
    "wooden_star_rules.sort_values(['lift','support'],ascending=False).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# In other words, it returns the items which are likely to be bought by user because he bought the item passed into function\n",
    "def frequently_bought_t(item):\n",
    "    # df of item passed\n",
    "    item_d = basket.loc[basket[item]==1]\n",
    "    # Applying apriori algorithm on item df\n",
    "    frequentitemsets = apriori(item_d, min_support=0.15, use_colnames=True)\n",
    "    # Storing association rules\n",
    "    rules = association_rules(frequentitemsets, metric=\"lift\", min_threshold=1)\n",
    "    # Sorting on lift and support\n",
    "    rules.sort_values(['lift','support'],ascending=False).reset_index(drop=True)\n",
    "    print('Items frequently bought together with {0}'.format(item))\n",
    "    # Returning top 6 items with highest lift and support\n",
    "    return rules['consequents'].unique()[:6]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "frequently_bought_t('WOODEN STAR CHRISTMAS SCANDINAVIAN')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "frequently_bought_t('JAM MAKING SET WITH JARS')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "777490da48e046e3b512f0b24bf037db286a787493a11bf82a9e0f2cbf21bb67"
  },
  "kernelspec": {
   "display_name": "Python 3.8.7 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
